Gather

沿着给定的轴 axis，根据 indices 张量提供的索引值，从 input 张量中收集数据。支持 batch_dims 指定的批处理维度，即在前 batch_dims 个维度上，索引和输入是对应的。

计算公式如下：

\[\text{output}[\dots, j, \dots] = \text{input}[\dots, \text{indices}[\dots, j, \dots], \dots]\]

其中 indices 提供 axis 维度上的索引，其余维度与 input 保持一致。

输入：

input - 输入源张量数据地址。
output - 输出数据地址。
indices - 索引张量数据地址（int32 类型）。
params - 打包为 long long 数组的参数（共 6 个元素），格式如下：
- params[0]: input_shape 地址（输入张量的形状数组）
- params[1]: input_ndim（输入张量的维度数量）
- params[2]: indices_shape 地址（索引张量的形状数组）
- params[3]: indices_ndim（索引张量的维度数量）
- params[4]: axis（沿着哪个轴进行聚集操作）
- params[5]: batch_dims（批处理维度数量）
core_mask(int) - 核掩码。仅共享存储版本（_s 后缀）的接口带有该参数，私有存储版本（_p 后缀）没有。

输出：

output - 聚集后的计算结果。

支持平台：

FT78NE MT7004

备注：

FT78NE 支持 int8, int16, int32, fp32, fp64, cplx64, cplx128
MT7004 支持 fp16, fp32, int16, int32, cplx64
索引张量 indices 内部存储的索引值必须在 [0, input_shape[axis]) 范围内，否则行为未定义。
聚集操作涉及非连续访存，在大规模数据下建议使用共享存储版本并行处理。

共享存储版本:

void i8_gather_s(int8_t *input, int8_t *output, int32_t *indices, long long *params, int core_mask)

void i16_gather_s(int16_t *input, int16_t *output, int32_t *indices, long long *params, int core_mask)

void i32_gather_s(int32_t *input, int32_t *output, int32_t *indices, long long *params, int core_mask)

void hp_gather_s(half *input, half *output, int32_t *indices, long long *params, int core_mask)

void fp_gather_s(float *input, float *output, int32_t *indices, long long *params, int core_mask)

void dp_gather_s(double *input, double *output, int32_t *indices, long long *params, int core_mask)

void c64_gather_s(float *input, float *output, int32_t *indices, long long *params, int core_mask)

void c128_gather_s(double *input, double *output, int32_t *indices, long long *params, int core_mask)

C调用示例：

// FT78NE 示例：多核并行聚集操作
#include <stdio.h>
#include "78NE/utils.h"

int main() {
    /* Input, index and output buffers at the fixed addresses used by this example. */
    float *input = (float *)0xA0000000;
    float *output = (float *)0xC0000000;
    int32_t *indices = (int32_t *)0xB0000000;

    /* input is [16, 800, 80]; indices is [16, 400]. */
    int input_shape[] = {16, 800, 80};
    int indices_shape[] = {16, 400};

    /* Ranks are taken from the shape arrays themselves (3 and 2 here). */
    int input_ndim = (int)(sizeof input_shape / sizeof input_shape[0]);
    int indices_ndim = (int)(sizeof indices_shape / sizeof indices_shape[0]);

    int axis = 1;         /* gather along the dimension of size 800 */
    int batch_dims = 1;   /* the leading dimension (16) is a batch dimension */
    int core_mask = 0xFF; /* core mask handed to the shared-storage variant */

    /*
     * Pack the six parameters in the documented order:
     * input_shape address, input_ndim, indices_shape address,
     * indices_ndim, axis, batch_dims.
     */
    long long params[6] = {
        (long long)input_shape,
        (long long)input_ndim,
        (long long)indices_shape,
        (long long)indices_ndim,
        (long long)axis,
        (long long)batch_dims,
    };

    fp_gather_s(input, output, indices, params, core_mask);
    return 0;
}

私有存储版本:

void i8_gather_p(int8_t *input, int8_t *output, int32_t *indices, long long *params)

void i16_gather_p(int16_t *input, int16_t *output, int32_t *indices, long long *params)

void i32_gather_p(int32_t *input, int32_t *output, int32_t *indices, long long *params)

void hp_gather_p(half *input, half *output, int32_t *indices, long long *params)

void fp_gather_p(float *input, float *output, int32_t *indices, long long *params)

void dp_gather_p(double *input, double *output, int32_t *indices, long long *params)

void c64_gather_p(float *input, float *output, int32_t *indices, long long *params)

void c128_gather_p(double *input, double *output, int32_t *indices, long long *params)

C调用示例：

// FT78NE 示例：单核聚集操作
#include <stdio.h>
#include "78NE/utils.h"

int main() {
    /* Input, index and output buffers at the fixed addresses used by this example. */
    float *input = (float *)0x10000000;
    int32_t *indices = (int32_t *)0x10010000;
    float *output = (float *)0x10020000;

    /* input is [2, 100, 10]; indices holds 50 entries. */
    int input_shape[] = {2, 100, 10};
    int indices_shape[] = {50};

    /*
     * Derive the ranks from the shape arrays so the two cannot disagree.
     * A hand-written rank of 2 would not match the three-element
     * input_shape above.
     */
    int input_ndim = (int)(sizeof input_shape / sizeof input_shape[0]);
    int indices_ndim = (int)(sizeof indices_shape / sizeof indices_shape[0]);

    int axis = 1;       /* gather along the dimension of size 100 */
    int batch_dims = 0; /* no batch dimensions */

    /*
     * Pack the six parameters in the documented order:
     * input_shape address, input_ndim, indices_shape address,
     * indices_ndim, axis, batch_dims.
     */
    long long params[6];
    params[0] = (long long)input_shape;
    params[1] = (long long)input_ndim;
    params[2] = (long long)indices_shape;
    params[3] = (long long)indices_ndim;
    params[4] = (long long)axis;
    params[5] = (long long)batch_dims;

    fp_gather_p(input, output, indices, params);
    return 0;
}